1… 10
11.a A movie that is kept should appear in the dataset at least 18 times: it has one record for each weekend day (Friday, Saturday, and Sunday) for at least 6 weekends, starting with its opening weekend (3 days × 6 weekends = 18 records). The movies that were dropped were not in theaters for more than 6 weekends.
11.b
# Keep only the films that are not flagged as dropped (dropped == 1).
# Rows where `dropped` is NA are also removed, because filter() keeps only
# rows whose condition evaluates to TRUE.
films_used <- filter(films, dropped != 1)
11.c
# Recover the calendar date that corresponds to day 0 of the numeric `date`
# column, using one film whose release date is known.

# Numeric value found in the "date" column for 12 Rounds.
days_before <- 17984 #number under 12 Rounds "date" column
# Calendar date on which 12 Rounds came out.
round_12_date <- as.Date("2009-03-27")

# Step back `days_before` days from the known date to get day 0.
# NOTE(review): if the column is a Stata daily date (days since 1960-01-01),
# 17984 corresponds to 2009-03-28, and this origin would be one day early.
# Confirm against the raw data before relying on the weekday of any date
# derived from `reference_date`.
reference_date <- round_12_date - days_before

# Show the derived origin.
print(reference_date)
## [1] "1959-12-31"
11.d
# Convert the numeric `date` column into a calendar date by offsetting it
# from `reference_date`.
films_used_d <- mutate(
  films_used,
  movie_date = as.Date(reference_date + date)
)

# Move the key columns to the front so that movie_date sits in the 4th
# position; every other column keeps its relative order.
films_used_d <- select(
  films_used_d,
  title, production_budget, release_yr, movie_date, sat_date,
  everything()
)

# Quick look at the new date column.
films_used_d[, c("title", "movie_date")]
11.e
# Build day-of-week indicator columns by comparing each record's date with
# the date derived from `sat_date`.
films_used_date <- films_used_d |>
  mutate(
    # Calendar date derived from sat_date.
    sat_day = reference_date + sat_date,
    # Sanity check on the weekday of that date. The original author observed
    # that these dates all come out as Fridays rather than Saturdays.
    # NOTE(review): that may be a symptom of `reference_date` being one day
    # early -- confirm before trusting the weekday names used below.
    sat_day_of_week = wday(sat_day, label = TRUE),
    # 1 when the record falls on the derived date itself.
    fri_dummy = as.numeric(movie_date == sat_day),
    # 1 when the record falls one day after the derived date.
    sat_dummy = as.numeric(movie_date == sat_day + 1),
    # 1 when the record falls one day before the derived date (the author's
    # reasoning: no movie was released on a Sunday).
    thu_dummy = as.numeric(movie_date == sat_day - 1)
  ) |>
  # The derived dates are treated as Fridays, so name the column accordingly
  # and add a separate Saturday column one day later.
  rename(fri_day = sat_day) |>
  mutate(sat_day = fri_day + 1)

# Inspect the indicator columns.
films_used_date[, c(
  "title", "movie_date", "fri_day",
  "fri_dummy", "sat_dummy", "thu_dummy"
)]
11.f
# Number each film's weekends (1 = first weekend in the data for that title)
# and expand the result into week_* indicator columns with fastDummies.
films_used_date <- films_used_date |>
  arrange(title, fri_day) |>
  group_by(title) |>
  # Within each title, map the sorted unique sat_date values to 1, 2, 3, ...
  mutate(week = as.integer(factor(sat_date))) |>
  # Drop the grouping so it cannot leak into later steps; whole-frame
  # operations inside mutate() would otherwise be evaluated per title.
  ungroup()

# One indicator column per distinct value of `week` (week_1, week_2, ...).
films_used_date <- dummy_cols(films_used_date, select_columns = "week")

# Inspect the first few week indicators.
films_used_date[, c("title", "movie_date", "week_1", "week_2", "week_3")]
11.g
# Expand release_yr into one indicator column per year (release_yr_<year>)
# using fastDummies.
film <- films_used_date |>
  dummy_cols(select_columns = "release_yr")

# Inspect two of the generated year indicators.
film[, c("title", "release_yr", "release_yr_2009", "release_yr_2010")]
11.h
# Average tickets by week and day of the weekend, then plot one line per day.
temp <- film |>
  # Collapse the three indicator columns into a single label. Rows where none
  # of the indicators equals 1 get NA, since case_when() has no default here.
  mutate(weekend = case_when(
    sat_dummy == 1 ~ "Saturday",
    fri_dummy == 1 ~ "Friday",
    thu_dummy == 1 ~ "Thursday"
  )) |>
  group_by(week, weekend) |>
  # .groups = "drop" returns an ungrouped result instead of leaving `temp`
  # grouped by week, and silences the regrouping message.
  summarize(mean = mean(tickets, na.rm = TRUE), .groups = "drop")

temp |>
  ggplot(aes(x = week, y = mean, color = as.factor(weekend))) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = c(
    "Saturday" = "#4682B4",
    "Friday" = "red",
    "Thursday" = "#8B008B"
  )) +
  labs(
    color = "Weekend",
    y = "Tickets",
    x = "Week"
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 6)) + # Set x-axis ticks
  scale_y_continuous(breaks = scales::pretty_breaks(n = 6)) + # Set y-axis ticks
  theme_bw()
NOT NEEDED
# Regress tickets on every indicator column in `film` and store the fitted
# values and residuals ("abnormal viewership").
film_cols <- colnames(film)

# Column names containing "hh".
# NOTE(review): presumably the holiday indicators -- confirm against the data.
holiday <- str_subset(film_cols, "hh")
# Join each set of column names with " + " so it can be used as formula terms.
holiday_dummy <- str_c(holiday, collapse = " + ")
# Day-of-week indicators (column names containing "dummy").
weekend_dummy <- str_c(str_subset(film_cols, "dummy"), collapse = " + ")
# Week indicators (column names containing "week_").
week_dummy <- str_c(str_subset(film_cols, "week_"), collapse = " + ")
# Release-year indicators (column names containing "release_yr_").
year_dummy <- str_c(str_subset(film_cols, "release_yr_"), collapse = " + ")

# Assemble the model formula as text.
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")

# Fit the linear model.
reg_mod1 <- lm(as.formula(mod1), data = film)

# Predicted tickets for every row, and the gap between actual and predicted.
film <- film |>
  mutate(
    pred_tickets = predict(reg_mod1, film),
    abnormal_viewership = tickets - pred_tickets
  )

film[, c("tickets", "pred_tickets", "abnormal_viewership", "fri_day")]
# Load the collapsed weather data.
weather <- read_dta("data/weather_collapsed_all.dta")

# Remember the original column names, then prefix every column with "www_"
# so the weather columns are easy to identify after joining.
original_cols <- colnames(weather)
colnames(weather) <- paste0("www_", original_cols)

# Print the renamed table.
weather
# Attach the weather columns to the film records by date. This is a left
# join: every row of `film` is kept, and rows whose movie_date has no match
# in www_sat_date get NA in the weather columns.
# NOTE(review): the key on the weather side is a sat_date column, so only some
# film dates may find a match -- confirm that this is intended.
weather_film <- left_join(
  film,
  weather,
  by = c("movie_date" = "www_sat_date")
)

# Inspect the join key alongside the weather columns.
select(weather_film, contains("movie_date"), contains("www"))